What are the model assumptions for?
They are needed for statistical inference:
hypothesis testing
confidence intervals
Inference on \(\hat{\beta}_{1}\) : hypothesis testing
The null hypothesis is \(\beta_{1}=0\) and the test statistic is
\[\frac{\hat{\beta}_{1}-\beta_{1}}{SE(\hat{\beta}_{1})}\]
which, under the null hypothesis, reduces to \(\frac{\hat{\beta}_{1}}{SE(\hat{\beta}_{1})}\), distributed as a Student \(t\) with \(n-2\) d.f.
\[\text{Reject } H_{0} \text{ if } \left|\frac{\hat{\beta}_{1}}{SE(\hat{\beta}_{1})}\right|>t_{1-\frac{\alpha}{2},n-2}\]
or look at the p-value
\[p\text{-value} = 2\times P\left(t_{n-2}>\left|\frac{\hat{\beta}_{1}}{SE(\hat{\beta}_{1})}\right|\right)\]
the lower the p-value, the stronger the evidence against the null hypothesis
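The same computation can be done by hand from an \(\texttt{lm}\) fit; a minimal sketch on simulated data (all object names are illustrative):
Code
# simulate a simple linear model: y = 2 + 0.5 x + noise
set.seed(1)
n = 100
x = rnorm(n)
y = 2 + 0.5 * x + rnorm(n)
fit = lm(y ~ x)
beta1_hat = coef(fit)[2]                    # slope estimate
se_beta1 = summary(fit)$coefficients[2, 2]  # its standard error
t_stat = beta1_hat / se_beta1               # test statistic under H0: beta1 = 0
p_value = 2 * pt(abs(t_stat), df = n - 2, lower.tail = FALSE)
c(t_stat, p_value)  # matches columns 3 and 4 of summary(fit)$coefficients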
Linear regression: multiple predictors
In algebraic form
\[{\bf y}={\bf X}{\bf \beta}+{\bf \epsilon}\]
\[\begin{bmatrix}
y_{1}\\
y_{2}\\
y_{3}\\
\vdots\\
y_{n}
\end{bmatrix}=\begin{bmatrix}
1& x_{1,1}&\ldots&x_{1,p}\\
1& x_{2,1}&\ldots&x_{2,p}\\
1& x_{3,1}&\ldots&x_{3,p}\\
\vdots&\vdots&\ddots&\vdots\\
1& x_{n,1}&\ldots&x_{n,p}\\
\end{bmatrix}\begin{bmatrix}
\beta_{0}\\
\beta_{1}\\
\vdots\\
\beta_{p}\\
\end{bmatrix}+\begin{bmatrix}
\epsilon_{1}\\
\epsilon_{2}\\
\epsilon_{3}\\
\vdots\\
\epsilon_{n}
\end{bmatrix}\]
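In R, the design matrix \({\bf X}\) implied by a formula can be inspected directly; a quick sketch on the built-in \(\texttt{mtcars}\) data:
Code
# model.matrix() builds X from a formula:
# a leading column of 1s (the intercept) followed by the predictors
X = model.matrix(mpg ~ hp + wt, data = mtcars)
head(X, 3)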
Linear regression: ordinary least square (OLS)
OLS target
\[\min_{\hat{\beta}_{0},\hat{\beta}_{1},\ldots,\hat{\beta}_{p}}\sum_{i=1}^{n}\left(y_{i}-\hat{\beta}_{0}-\sum_{j=1}^{p}\hat{\beta}_{j}x_{ij}\right)^{2}=\min_{\hat{\beta}_{0},\hat{\beta}_{1},\ldots,\hat{\beta}_{p}}RSS\]
OLS target (algebraic)
\[ \min_{\hat{\beta}} ({\bf y}-{\bf X}{\bf \hat{\beta}})^{\sf T}({\bf y}-{\bf X}{\bf \hat{\beta}})\]
Linear regression: ordinary least square (OLS) solution
OLS target \[\min_{\hat{\beta}}({\bf y}-{\bf X}{\bf \hat{\beta}})^{\sf T}({\bf y}-{\bf X}{\bf \hat{\beta}})=\min_{\hat{\beta}}\left({\bf y}^{\sf T}{\bf y}-{\bf y}^{\sf T}{\bf X} {\bf \hat{\beta}}-{\bf{\hat{\beta}}}^{\sf T}{\bf X}^{\sf T}{\bf y}+{\bf {\hat{\beta}}}^{\sf T}{\bf X}^{\sf T}{\bf X} {\bf \hat{\beta}}\right)\]
First order conditions
\[\partial_{\hat{\beta}}\left({\bf y}^{\sf T}{\bf y}-{\bf y}^{\sf T}{\bf X} {\bf \hat{\beta}}-{\bf{\hat{\beta}}}^{\sf T}{\bf X}^{\sf T}{\bf y}+{\bf {\hat{\beta}}}^{\sf T}{\bf X}^{\sf T}{\bf X} {\bf \hat{\beta}}\right)=0\]
Since \(\partial_{\bf\hat{\beta}}\left({\bf y}^{\sf T}{\bf X} {\bf \hat{\beta}}\right)=\partial_{\bf\hat{\beta}}\left({ \bf {\hat{\beta}}}^{\sf T}{\bf X}^{\sf T}{\bf y}\right)={\bf X}^{\sf T}{\bf y}\) it results that
\[\partial_{\hat{\beta}}RSS=-2{\bf X}^{\sf T}{\bf y}+2{\bf X}^{\sf T}{\bf X} {\bf \hat{\beta}}=0\]
And the OLS solution is
\[ \hat{\beta} = ({\bf X}^{\sf T}{\bf X})^{-1}{\bf X}^{\sf T}{\bf y} \]
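The closed form can be checked numerically; a minimal sketch on simulated data (solving the normal equations directly is fine for illustration, although \(\texttt{lm}\) internally uses a more stable QR decomposition):
Code
set.seed(1)
n = 100
X = cbind(1, rnorm(n), rnorm(n))          # design matrix, intercept column included
beta = c(1, 2, -1)
y = X %*% beta + rnorm(n)
beta_hat = solve(t(X) %*% X, t(X) %*% y)  # (X'X)^{-1} X'y
cbind(beta_hat, coef(lm(y ~ X[, -1])))    # agrees with lm()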
Advertising data
Code
adv_data = read_csv (file= "./data/Advertising.csv" ) %>% select (- 1 )
adv_data %>% slice_sample (n = 8 ) %>%
kbl ()
| TV | radio | newspaper | sales |
|---|---|---|---|
| 76.4 | 0.8 | 14.8 | 9.4 |
| 66.9 | 11.7 | 36.8 | 9.7 |
| 265.6 | 20.0 | 0.3 | 17.4 |
| 151.5 | 41.3 | 58.5 | 18.5 |
| 23.8 | 35.1 | 65.9 | 9.2 |
| 237.4 | 27.5 | 11.0 | 18.9 |
| 7.8 | 38.9 | 50.6 | 6.6 |
| 197.6 | 23.3 | 14.2 | 16.6 |
sales is the response, indicating the level of sales in a specific market
TV, Radio and Newspaper are the features, indicating the advertising budget spent on the corresponding media
Advertising data: tidy
Code
adv_data_tidy = adv_data %>% pivot_longer (names_to= "medium" ,values_to= "budget" ,cols = 1 : 3 )
adv_data_tidy %>% slice_sample (n = 8 ) %>%
kbl ()
| sales | medium | budget |
|---|---|---|
| 14.8 | radio | 10.1 |
| 19.4 | radio | 33.5 |
| 10.5 | radio | 16.0 |
| 11.8 | newspaper | 23.7 |
| 11.7 | newspaper | 5.9 |
| 19.2 | TV | 193.7 |
| 16.9 | TV | 163.3 |
| 16.6 | TV | 202.5 |
Advertising data: visualization
Code
adv_data_tidy %>%
ggplot (aes (x = budget, y = sales)) + theme_minimal () + facet_wrap (~ medium,scales = "free" ) +
geom_point (color= "indianred" ,alpha= .5 ,size= 3 )
Note : the TV budgets range up to 300, while the Radio and Newspaper budgets are considerably smaller
Advertising data: single regressions
We can be naive and regress sales on tv, newspaper and radio, separately.
Code
avd_lm_models_nest = adv_data_tidy %>% #<<
group_by (medium) %>% #<<
group_nest (.key = "datasets" ) %>% #<<
mutate (
model_output = map (.x= datasets,~ lm (sales~ budget,data= .x)),
model_params = map (.x= model_output, ~ tidy (.x)),
model_metrics = map (.x= model_output, ~ glance (.x)),
model_fitted = map (.x= model_output, ~ augment (.x))
)
avd_lm_models_nest
# A tibble: 3 × 6
medium datasets model_output model_params model_metrics model_fitted
<chr> <list<tibble[,> <list> <list> <list> <list>
1 TV [200 × 2] <lm> <tibble> <tibble> <tibble>
2 newspaper [200 × 2] <lm> <tibble> <tibble> <tibble>
3 radio [200 × 2] <lm> <tibble> <tibble> <tibble>
this is a nested structure: the columns are lists (one element of which can be pulled out as shown below)
datasets : the per-medium data
model_output : the whole output object (produced by lm())
model_params : coefficient estimates, test statistics and p-values
model_metrics : goodness-of-fit measures
model_fitted : point-wise estimates, confidence and prediction bands
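For instance, a single nested element can be extracted like this (a short sketch; it assumes the avd_lm_models_nest object built above):
Code
# the tidy coefficient table of the first medium (rows are ordered as printed above)
avd_lm_models_nest$model_params[[1]]
# or, more explicitly
avd_lm_models_nest %>% filter(medium == "TV") %>% pull(model_params) %>% pluck(1)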
Three regression lines
Code
three_regs_plot= avd_lm_models_nest %>% unnest (model_fitted) %>% #<<
select (medium,sales: .fitted) %>% #<<
ggplot (aes (x= budget,y= sales))+ theme_minimal ()+ facet_grid (~ medium,scales= "free" )+ #<<
geom_point (alpha= .5 ,color = "indianred" )+
geom_smooth (method= "lm" )+
geom_segment (aes (x= budget,xend= budget,y= .fitted,yend= sales),color= "forestgreen" ,alpha= .25 )
three_regs_plot
Advertising data: nested structure
The quantities nested in the tibble can be pulled out, or they can be expanded within the tibble itself with \(\texttt{unnest}\)
Code
avd_lm_models_nest %>% unnest (model_params) %>%
dplyr:: select (medium,term: p.value) %>%
kbl (digits = 4 ) %>%
row_spec (1 : 2 ,background = "#D9FDEC" ) %>%
row_spec (3 : 4 ,background = "#CAF6FC" ) %>%
row_spec (5 : 6 ,background = "#FDB5BA" )
| medium | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| TV | (Intercept) | 7.0326 | 0.4578 | 15.3603 | 0.0000 |
| TV | budget | 0.0475 | 0.0027 | 17.6676 | 0.0000 |
| newspaper | (Intercept) | 12.3514 | 0.6214 | 19.8761 | 0.0000 |
| newspaper | budget | 0.0547 | 0.0166 | 3.2996 | 0.0011 |
| radio | (Intercept) | 9.3116 | 0.5629 | 16.5422 | 0.0000 |
| radio | budget | 0.2025 | 0.0204 | 9.9208 | 0.0000 |
Code
avd_lm_models_nest %>% unnest (model_metrics) %>%
dplyr:: select (medium,r.squared: p.value) %>%
kbl (digits = 4 ) %>%
row_spec (1 ,background = "#D9FDEC" ) %>%
row_spec (2 ,background = "#CAF6FC" ) %>%
row_spec (3 ,background = "#FDB5BA" )
| medium | r.squared | adj.r.squared | sigma | statistic | p.value |
|---|---|---|---|---|---|
| TV | 0.6119 | 0.6099 | 3.2587 | 312.1450 | 0.0000 |
| newspaper | 0.0521 | 0.0473 | 5.0925 | 10.8873 | 0.0011 |
| radio | 0.3320 | 0.3287 | 4.2749 | 98.4216 | 0.0000 |
Advertising data: multiple regression
Code
adv_mreg= lm (sales~ .,data= adv_data)
adv_mreg |> tidy () |>
kbl (digits = 4 ) %>%
row_spec (1 ,background = "white" ) %>%
row_spec (2 ,background = "#D9FDEC" ) %>%
row_spec (3 ,background = "#CAF6FC" ) %>%
row_spec (4 ,background = "#FDB5BA" )
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 2.9389 | 0.3119 | 9.4223 | 0.0000 |
| TV | 0.0458 | 0.0014 | 32.8086 | 0.0000 |
| radio | 0.1885 | 0.0086 | 21.8935 | 0.0000 |
| newspaper | -0.0010 | 0.0059 | -0.1767 | 0.8599 |
Advertising data: multiple regression vs simple
multiple regression estimates
Code
adv_mreg= lm (sales~ .,data= adv_data)
adv_mreg |> tidy () |> filter (term!= "(Intercept)" ) |>
kbl (digits = 4 ) %>% kable_styling (font_size = 12 ) |>
row_spec (1 ,background = "#D9FDEC" ) %>%
row_spec (2 ,background = "#FDB5BA" ) %>%
row_spec (3 ,background = "#CAF6FC" )
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| TV | 0.0458 | 0.0014 | 32.8086 | 0.0000 |
| radio | 0.1885 | 0.0086 | 21.8935 | 0.0000 |
| newspaper | -0.0010 | 0.0059 | -0.1767 | 0.8599 |
single regression estimates
Code
avd_lm_models_nest %>% unnest (model_params) %>%
dplyr:: select (medium,term: p.value) |> filter (term!= "(Intercept)" ) |>
kbl (digits = 4 ) %>% kable_styling (font_size = 12 ) |>
row_spec (1 ,background = "#D9FDEC" ) %>%
row_spec (2 ,background = "#CAF6FC" ) %>%
row_spec (3 ,background = "#FDB5BA" )
| medium | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| TV | budget | 0.0475 | 0.0027 | 17.6676 | 0.0000 |
| newspaper | budget | 0.0547 | 0.0166 | 3.2996 | 0.0011 |
| radio | budget | 0.2025 | 0.0204 | 9.9208 | 0.0000 |
They are not the same because some of the features are correlated with one another, as the quick check below shows.
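A look at the correlation matrix of the data confirms it (a quick sketch; the radio and newspaper budgets, in particular, are noticeably correlated):
Code
# pairwise correlations between the advertising budgets and sales
adv_data %>% cor() %>% round(2)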
Improving models
sub-set selection : reduce the number of predictors to compress the model variance and improve interpretability.
shrinkage methods : the \(p\) predictors are all kept in the model, but the coefficient estimates are shrunken towards 0. This also compresses the variance.
dimension reduction : \(M\) synthetic predictors are defined as linear combinations of the original \(p\) variables (PCA, anyone?); setting \(M<p\) reduces the model complexity
best subset selection
This approach searches the complete space of the \(2^{p}\) possible models one can build from \(p\) predictors.
\(\mathcal{M}_{0}\) is the null model (intercept only).
For \(k=1,\ldots,p\)
fit \(p\choose k\) possible models on \(k\) predictors
find \(\mathcal{M}_{k}\) , the best model with \(k\) predictors: lowest RSS or largest \(R^{2}\) .
From the sequence \(\mathcal{M}_{0},\mathcal{M}_{1},\ldots,\mathcal{M}_{p}\) of best models given the number of predictors the overall best is chosen
due to the different degrees of freedom, a test error estimate is required to make the choice.
or, one can use some training error measures adjusted for the model complexity. (Mallow \(C_{p}\) , AIC , the BIC , the adjusted \(R^{2}\) ).
best subset selection
best sub-set selection: credit dataset
Code
magick:: image_read_pdf ("./figures/BestSubset.pdf" ,pages = 1 )
The response is balance ; there are 11 predictors, including two dummies for ethnicity .
each \(\color{white!50!black}{\text{gray}}\) point is a model with \(x\) predictors; the y-axes show RSS (left) and \(R^{2}\) (right).
each \(\color{red}{\text{red}}\) point is the best model with \(x\) predictors.
Cons best subset selection
computational complexity : \(2^{p}\) models must be fitted, in the previous example 1024 models have been fitted. With 20 predictors, one needs to fit more than a million models (1048576)!
statistical problem : due to the high number of fitted models, it is likely that some model fits the training set well, or even overfits it, purely by chance
forward stepwise selection
\(\mathcal{M}_{0}\) is the null model (intercept only)
For \(k=0,\ldots,p-1\)
fit the \(p - k\) possible models that add a further predictor to the model \(\mathcal{M}_{k}\) ;
find the best there is and define it \(\mathcal{M}_{k+1}\) .
From the sequence \(\mathcal{M}_{0},\mathcal{M}_{1},\ldots,\mathcal{M}_{p}\) of best models given the number of predictors, the overall best is chosen
due to the different degrees of freedom, a test error estimate is required to make the choice.
or, one can use some training error measures adjusted for the model complexity. (Mallow \(C_{p}\) , AIC , BIC , adjusted \(R^{2}\) )
backward stepwise selection
fit \(\mathcal{M}_{p}\) , the full model .
For \(k=p, p-1,\ldots,1\)
fit all the possible models that contain all the predictors in \(\mathcal{M}_{k}\) but one;
find the best among the \(k\) models, that becomes \(\mathcal{M}_{k-1}\) .
From the sequence \(\mathcal{M}_{0},\mathcal{M}_{1},\ldots,\mathcal{M}_{p}\) of best models given the number of predictors, the overall best is chosen
due to the different degrees of freedom, a test error estimate is required to make the choice.
or, one can use some training error measures adjusted for the model complexity. (Mallow \(C_{p}\) , AIC , BIC , adjusted \(R^{2}\) )
the stepwise approach
the number of models to evaluate is \(1+p(p+1)/2\), far smaller than \(2^{p}\) (for \(p=20\): 211 models instead of 1048576)
the reduced search space, however, does not guarantee that the best possible model is found
the backward selection cannot be applied when \(p>n\)
selecting the best model irrespective of its size
adjust the training error measure
add a price to pay for each extra predictor in the model
use cross-validation to estimate the test error
estimate the performance of the model on the test set: e.g. via cross-validation
Adjusting the training error
Mallow \(C_{p} = \frac{1}{n} (RSS+2d\hat{\sigma}^{2})\)
AIC \(= -2\log(L) +2d\)
BIC \(= \frac{1}{n} (RSS+\log(n)d\hat{\sigma}^{2})\)
Adjusted \(R^{2}= 1-\frac{RSS/(n-d-1)}{TSS/(n-1)}\)
\(d\) is the number of parameters in the model;
\(\hat{\sigma}^{2}\) is the estimate of the error \(\epsilon\) variance;
\(L\) is the max of the likelihood function;
for linear models with normal errors, maximizing \(L\) is equivalent to minimizing \(RSS\), so \(C_{p}\) and \(AIC\) are proportional and select the same model
\(C_{p}\) and \(BIC\) differ in \(2\) being replaced by \(\log(n)\) in BIC: since, for \(n>7\) , \(\log(n)>2\) , the \(BIC\) tends to select a lower number of predictors;
the adjusted \(R^{2}\) penalizes \(RSS\) by dividing it by \(n-d-1\), a penalty that grows with the number of parameters in the model; a sketch of how to compute these criteria follows.
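Most of these quantities are returned by \(\texttt{broom::glance}\) on a fitted \(\texttt{lm}\); a minimal sketch on the Advertising data, with Mallow's \(C_{p}\) computed by hand since glance does not report it (here \(\hat{\sigma}^{2}\) is estimated from the same model, purely for illustration):
Code
fit = lm(sales ~ ., data = adv_data)
glance(fit) %>% dplyr::select(adj.r.squared, AIC, BIC)
# Mallow's Cp by hand: d parameters, sigma^2 estimated from the fit
n = nrow(adv_data)
d = length(coef(fit))
rss = sum(residuals(fit)^2)
sigma2_hat = rss / (n - d)
(rss + 2 * d * sigma2_hat) / n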
Credit data example: adjusting the training error
Using \(C_{p}\) and the adjusted \(R^{2}\), the choice is 6 and 7 predictors, respectively. The \(BIC\) leads to choosing 4 predictors.
Cross-validation selection
\(k\) -fold cross-validation can be used to estimate the test error and choose the best model;
an advantage is that \(\hat{\sigma}\) and \(d\) are not required (and they are sometimes not easy to identify);
this approach can be applied to any type of model
Credit data example: BIC vs validation approaches
Subset selection
note that \(\texttt{leaps::regsubsets}\) provides neither a pre-defined \(\texttt{predict}\) method nor a \(\texttt{broom::augment}\) one.
selection is done via the provided complexity-adjusted training error measures: \(C_{p}\), \(AIC\), \(BIC\), adjusted \(R^{2}\)
An ad-hoc procedure is needed for cross-validation based selection.
Subset selection example: hitters dataset
basic cleaning
Remove missings and clean the names
Code
data (Hitters,package = "ISLR2" )
my_hitters = Hitters %>% na.omit () %>%
as_tibble () %>% clean_names ()
set.seed (123 )
main_split= initial_split (my_hitters, prop= 4 / 5 )
hit_train= training (main_split)
hit_test= testing (main_split)
hit_flds = vfold_cv (data = hit_train,v= 10 )
subset selection example: hitters dataset
data splitting
keep the test observations for evaluation, arrange the training observations in folds, for cross-validation
Code
data (Hitters,package = "ISLR2" )
my_hitters = Hitters %>% na.omit () %>%
as_tibble () %>% clean_names ()
set.seed (123 )
main_split= initial_split (my_hitters, prop= 4 / 5 )
hit_train= training (main_split)
hit_test= testing (main_split)
hit_flds = vfold_cv (data = hit_train,v= 10 )
subset selection example: hitters dataset
To access the output of \(\texttt{regsubsets}\) one can use
Code
reg_sub_out = regsubsets (salary~ .,data= hit_train,
nvmax = 19 ,method = "exhaustive" )
reg_sub_out %>% tidy %>% kbl () %>% kable_styling (font_size = 10 )
(Output omitted: a 19-row table, one row per model size, with TRUE/FALSE flags marking which of the 19 predictors enter the best model of that size, followed by its r.squared, adj.r.squared, BIC and mallows_cp; the four metrics are tabulated on the next slide.)
subset selection example: hitters dataset
It is handy to define a model-size variable: this is easily done by counting the boolean variables row-wise
Code
model_size_vs_perf = reg_sub_out %>% tidy %>%
rowwise () %>%
mutate (n_predictors = sum (
c_across ("(Intercept)" : "new_leagueN" )
)- 1 ,.keep= "unused" ) %>%
arrange (n_predictors) %>% ungroup ()
| r.squared | adj.r.squared | BIC | mallows_cp | n_predictors |
|---|---|---|---|---|
| 0.3945540 | 0.3916432 | -94.68167 | 82.632753 | 1 |
| 0.4793254 | 0.4742947 | -121.01098 | 44.219902 | 2 |
| 0.5147955 | 0.5077294 | -130.48038 | 29.310338 | 3 |
| 0.5417171 | 0.5327750 | -137.12086 | 18.476068 | 4 |
| 0.5600773 | 0.5492949 | -140.36017 | 11.723250 | 5 |
| 0.5669372 | 0.5541373 | -138.31349 | 10.452933 | 6 |
| 0.5722116 | 0.5573872 | -135.53972 | 9.938504 | 7 |
| 0.5777671 | 0.5609618 | -132.93765 | 9.290045 | 8 |
| 0.5819960 | 0.5631858 | -129.70443 | 9.274000 | 9 |
| 0.5875981 | 0.5668744 | -127.19075 | 8.603345 | 10 |
| 0.5928848 | 0.5702673 | -124.55313 | 8.083007 | 11 |
| 0.5967606 | 0.5721978 | -121.21484 | 8.235300 | 12 |
| 0.5986463 | 0.5720259 | -116.85207 | 9.336337 | 13 |
| 0.5998215 | 0.5710908 | -112.12077 | 10.776077 | 14 |
| 0.6005753 | 0.5696920 | -107.16960 | 12.416728 | 15 |
| 0.6009755 | 0.5678958 | -102.03301 | 14.225938 | 16 |
| 0.6012686 | 0.5659643 | -96.84021 | 16.086206 | 17 |
| 0.6014386 | 0.5638778 | -91.58262 | 18.005195 | 18 |
| 0.6014495 | 0.5615944 | -86.24126 | 20.000000 | 19 |
subset selection example: hitters dataset
For each index, create a boolean variable indicating the best value
Code
model_size_vs_perf_w_best = model_size_vs_perf %>%
mutate (
across (.cols= 1 : 2 , ~ .x== max (.x),.names= "{.col}" ),
across (.cols= 3 : 4 , ~ .x== min (.x),.names= "{.col}" )
) %>%
select (n_predictors,everything ())
model_size_vs_perf_best= model_size_vs_perf_w_best|>
pivot_longer (names_to = "index" , values_to = "best" ,cols= "r.squared" : "mallows_cp" )
model_size_vs_perf = model_size_vs_perf %>%
pivot_longer (names_to = "index" , values_to = "value" ,cols= "r.squared" : "mallows_cp" ) %>%
bind_cols (model_size_vs_perf_best %>% select (best))
| n_predictors | r.squared | adj.r.squared | BIC | mallows_cp |
|---|---|---|---|---|
| 1 | FALSE | FALSE | FALSE | FALSE |
| 2 | FALSE | FALSE | FALSE | FALSE |
| 3 | FALSE | FALSE | FALSE | FALSE |
| 4 | FALSE | FALSE | FALSE | FALSE |
| 5 | FALSE | FALSE | TRUE | FALSE |
| 6 | FALSE | FALSE | FALSE | FALSE |
| 7 | FALSE | FALSE | FALSE | FALSE |
| 8 | FALSE | FALSE | FALSE | FALSE |
subset selection example: hitters dataset
convert the table to long format
Code
model_size_vs_perf_w_best = model_size_vs_perf %>%
mutate (
across (.cols= 1 : 2 , ~ .x== max (.x),.names= "{.col}" ),
across (.cols= 3 : 4 , ~ .x== min (.x),.names= "{.col}" )
) %>%
select (n_predictors,everything ())
model_size_vs_perf_best = model_size_vs_perf_w_best|>
pivot_longer (names_to = "index" , values_to = "best" ,cols= "r.squared" : "mallows_cp" )
model_size_vs_perf = model_size_vs_perf %>%
pivot_longer (names_to = "index" , values_to = "value" ,cols= "r.squared" : "mallows_cp" ) %>%
bind_cols (model_size_vs_perf_best %>% select (best))
| n_predictors | index | best |
|---|---|---|
| 1 | r.squared | FALSE |
| 1 | adj.r.squared | FALSE |
| 1 | BIC | FALSE |
| 1 | mallows_cp | FALSE |
| 2 | r.squared | FALSE |
| 2 | adj.r.squared | FALSE |
| 2 | BIC | FALSE |
| 2 | mallows_cp | FALSE |
subset selection example: hitters dataset
create a further long table with all the values, for each index and model size
Code
model_size_vs_perf_best = model_size_vs_perf %>%
mutate (
across (.cols= 1 : 2 , ~ .x== max (.x),.names= "{.col}" ),
across (.cols= 3 : 4 , ~ .x== min (.x),.names= "{.col}" )
) %>%
select (n_predictors,everything ())
model_size_vs_perf_best= model_size_vs_perf_best|>
pivot_longer (names_to = "index" , values_to = "best" ,cols= "r.squared" : "mallows_cp" )
model_size_vs_perf = model_size_vs_perf %>%
pivot_longer (names_to = "index" , values_to = "value" ,cols= "r.squared" : "mallows_cp" ) %>%
bind_cols (model_size_vs_perf_best %>% select (best))
subset selection example: hitters dataset
Finally, create a plot to summarize the information at hand
Code
model_size_vs_perf %>% ggplot (aes (x= n_predictors,y= value))+ geom_line ()+ geom_point (aes (color= best))+ facet_wrap (~ index,scales = "free" )
subset selection example: hitters dataset
List-wise setup
Create a training and a validation set for each combination of folds
Code
tidy_structure = hit_flds |>
mutate (
train = map (.x= splits, ~ training (.x)),
validate = map (.x= splits, ~ testing (.x))
)
# 10-fold cross-validation
# A tibble: 10 × 4
splits id train validate
<list> <chr> <list> <list>
1 <split [189/21]> Fold01 <tibble [189 × 20]> <tibble [21 × 20]>
2 <split [189/21]> Fold02 <tibble [189 × 20]> <tibble [21 × 20]>
3 <split [189/21]> Fold03 <tibble [189 × 20]> <tibble [21 × 20]>
4 <split [189/21]> Fold04 <tibble [189 × 20]> <tibble [21 × 20]>
5 <split [189/21]> Fold05 <tibble [189 × 20]> <tibble [21 × 20]>
6 <split [189/21]> Fold06 <tibble [189 × 20]> <tibble [21 × 20]>
7 <split [189/21]> Fold07 <tibble [189 × 20]> <tibble [21 × 20]>
8 <split [189/21]> Fold08 <tibble [189 × 20]> <tibble [21 × 20]>
9 <split [189/21]> Fold09 <tibble [189 × 20]> <tibble [21 × 20]>
10 <split [189/21]> Fold10 <tibble [189 × 20]> <tibble [21 × 20]>
subset selection example: hitters dataset
best subsets on the different folds
Apply \(\texttt{regsubsets}\) to each of the training folds previously defined
for best subset selection, choose \(\texttt{method="exhaustive"}\)
Code
tidy_structure = tidy_structure %>%
mutate (
model_fits = map (.x= train,
.f= ~ regsubsets (salary~ .,
data= .x,nvmax = 19 ,method = "exhaustive" ))
)
Now the predictors have to be pulled out from the different sized models
subset selection example: hitters dataset
pull information from each model sequence
create the matrix \({\bf X}_{test}\), which contains the predictor values for the test observations
pull the coefficient estimates for each model
Code
tidy_structure = tidy_structure %>%
mutate (
x_test = map (.x= validate,
.f= ~ model.matrix (as.formula ("salary~." ),as.data.frame (.x))),
coefficients= map (.x= model_fits,~ coef (.x,id= 1 : 19 ))
) %>%
unnest (coefficients) %>%
mutate (
x_test = map2 (.x= x_test, .y= coefficients, ~ .x[,names (.y)])
)
subset selection example: hitters dataset
obtain the predictions \({\bf \hat{y}}_{test}={\bf X}_{test}{\bf \hat{\beta}}\), then compute the squared residuals
Code
tidy_structure = tidy_structure %>%
mutate (yhat = map2 (.x= x_test, .y= coefficients, ~ .x %*% .y),
squared_resids = map2 (.x= yhat, .y= validate, ~ (.x- .y$ salary)^ 2 )) %>%
unnest (c (yhat,squared_resids)) %>%
select (id,validate,coefficients, yhat, squared_resids)
# A tibble: 3,990 × 5
id validate coefficients yhat[,1] squared_resids[,1]
<chr> <list> <list> <dbl> <dbl>
1 Fold01 <tibble [21 × 20]> <dbl [2]> 539. 83530.
2 Fold01 <tibble [21 × 20]> <dbl [2]> 377. 735.
3 Fold01 <tibble [21 × 20]> <dbl [2]> 441. 6107.
4 Fold01 <tibble [21 × 20]> <dbl [2]> 399. 123458.
5 Fold01 <tibble [21 × 20]> <dbl [2]> 265. 27365.
6 Fold01 <tibble [21 × 20]> <dbl [2]> 701. 112839.
7 Fold01 <tibble [21 × 20]> <dbl [2]> 619. 28541.
8 Fold01 <tibble [21 × 20]> <dbl [2]> 657. 3232.
9 Fold01 <tibble [21 × 20]> <dbl [2]> 1326. 301736.
10 Fold01 <tibble [21 × 20]> <dbl [2]> 260. 36215.
# ℹ 3,980 more rows
subset selection example: hitters dataset
compute the RMSE by fold for each model size, then average the fold RMSEs to obtain the cross-validated RMSE
Code
tidy_structure = tidy_structure %>%
group_by (id,coefficients) %>%
summarise (RMSE= sqrt (mean (squared_resids))) %>% ungroup () |>
mutate (model_size= map_int (coefficients,length)) %>%
group_by (model_size) %>%
summarise (cv_RMSE= mean (RMSE))
| model_size | cv_RMSE |
|---|---|
| 2 | 377.7384 |
| 3 | 350.9038 |
| 4 | 353.9554 |
| 5 | 337.6210 |
| 6 | 327.0380 |
| 7 | 335.2342 |
| 8 | 336.5313 |
| 9 | 337.8970 |
| 10 | 344.2199 |
| 11 | 346.6368 |
| 12 | 345.1199 |
| 13 | 337.5947 |
| 14 | 338.1094 |
| 15 | 335.1299 |
| 16 | 336.8434 |
| 17 | 336.5274 |
| 18 | 337.5436 |
| 19 | 336.8629 |
| 20 | 337.0753 |
subset selection example: hitters dataset
pick up the ‘best model’ according to cross-validation
Code
tidy_structure %>%
ggplot (aes (x= model_size, y= cv_RMSE))+
geom_point ()+ geom_line ()+ theme_minimal ()+
geom_point (aes (x = model_size[which.min (cv_RMSE)],
y = min (cv_RMSE)),inherit.aes = FALSE ,color= "red" ,size= 4 ,alpha= .5 )
the optimal model size is 6
subset selection example: hitters dataset
Finally, the coefficients of the optimal model are
Code
best_size= tidy_structure$ model_size[which.min (tidy_structure$ cv_RMSE)]
bsb_rmse = tidy_structure$ cv_RMSE[which.min (tidy_structure$ cv_RMSE)]
best_out = regsubsets (salary~ .,data= hit_train,nvmax= best_size,method= "exhaustive" )
coef (best_out,id= best_size)
(Intercept) walks c_at_bat c_hits c_hm_run divisionW
130.9726510 5.3668175 -0.5104821 2.0262373 1.6944526 -130.3609776
put_outs
0.1461783
shrinkage methods
all the predictors stay in the model, but the coefficient estimates are constrained
the shrinkage reduces the variability of the estimates
eventually, some of the coefficients may be constrained to zero, implicitly excluding the corresponding predictors from the model
the type of constraint defines the shrinkage method
ridge regression
A constraint is introduced on the optimization of the least squares function (RSS), in particular:
\[\underbrace{\sum_{i=1}^{n}{\left(y_{i}-\beta_{0}-\sum_{j=1}^{p}{\beta_{j}x_{ij}}\right)^{2}}}_{RSS} + \underbrace{\color{red}{\lambda\sum_{j=1}^{p}{\beta^{2}_{j}}}}_{constraint}\]
\(\lambda\) is the tuning parameter that determines the strength of the constraint on the estimates of \(\beta_{j}\) .
if \(\lambda=0\) then the ridge regression estimates \(\hat{\beta}^{R}_{\lambda}\) are the same as OLS \(\hat{\beta}\)
if \(\lambda\rightarrow \infty\) then \(\hat{\beta}^{R}_{j}\approx 0\)
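Unlike the lasso, ridge regression has a closed-form solution, \(\hat{\beta}^{R}_{\lambda}=({\bf X}^{\sf T}{\bf X}+\lambda{\bf I})^{-1}{\bf X}^{\sf T}{\bf y}\); a minimal sketch on simulated, standardized predictors (the intercept is omitted for simplicity, and \(\texttt{glmnet}\) uses a slightly different internal scaling, so exact values would differ):
Code
set.seed(1)
n = 50; p = 5
X = scale(matrix(rnorm(n * p), n, p))  # standardized predictors
y = as.vector(X %*% rnorm(p) + rnorm(n))
lambda = 2
beta_ols = solve(t(X) %*% X, t(X) %*% y)
beta_ridge = solve(t(X) %*% X + lambda * diag(p), t(X) %*% y)
cbind(ols = beta_ols, ridge = beta_ridge)  # ridge estimates are shrunken toward 0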
ridge regression
estimates for different values of \(\lambda\); in the right panel, the x-axis shows the ratio between \(||\hat{\beta}^{R}_{\lambda}||_{2}\) and \(||\hat{\beta}||_{2}\)
the \(\mathcal{L}_2\) norm is the square root of the sum of squared coefficients \(||\hat{\beta}||_{2}=\sqrt{\sum_{j=1}^{p}{\hat{\beta}^{2}_{j}}}\)
ridge regression
Synthetic data with \(n=50\) and \(p=45\) : all of the true \(\beta\) ’s differ from 0.
The grid is on \(\lambda\) and on \(||\hat{\beta}^{R}_{\lambda}||_{2}/||\hat{\beta}||_{2}\); the curves are the \(\color{black}{\text{squared bias}}\), the \(\color{blue!20!green}{\text{variance}}\) and the \(\color{violet}{\text{MSE test}}\)
the dotted line is the true MSE test
lasso regression
The lasso differs from ridge in the constraint
\[\underbrace{\sum_{i=1}^{n}{\left(y_{i}-\beta_{0}-\sum_{j=1}^{p}{\beta_{j}x_{ij}}\right)^{2}}}_{RSS} + \underbrace{\color{red}{\lambda\sum_{j=1}^{p}{|\beta_{j}|}}}_{constraint}\]
\(\lambda\) is the tuning parameter that determines the strength of the constraint on the estimates of \(\beta_{j}\) .
if \(\lambda=0\) then the lasso estimates \(\hat{\beta}^{L}_{\lambda}\) are the same as OLS \(\hat{\beta}\)
if \(\lambda\rightarrow \infty\) then \(\hat{\beta}^{L}_{j}\approx 0\)
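There is no closed form in general, but under an orthonormal design ( \({\bf X}^{\sf T}{\bf X}={\bf I}\) ) the lasso solution is a soft-thresholding of the OLS estimates, \(\hat{\beta}^{L}_{j}=\mathrm{sign}(\hat{\beta}_{j})\left(|\hat{\beta}_{j}|-\lambda/2\right)_{+}\); a sketch under that assumption:
Code
# soft-thresholding: the lasso solution when the columns of X are orthonormal
# (for the criterion RSS + lambda * sum(|beta|), the threshold is lambda / 2)
soft_threshold = function(b, lambda) sign(b) * pmax(abs(b) - lambda / 2, 0)
b_ols = c(3, -1.5, 0.4, -0.2)      # hypothetical OLS estimates
soft_threshold(b_ols, lambda = 1)  # small coefficients are set exactly to 0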
lasso regression
grid on \(\lambda\) (left) and on the \(\mathcal{L}_{1}\) ratio between the lasso and OLS coefficients (right)
variables selection and the lasso
The Lasso regression can be formalized as a constrained minimisation problem
\[\sum_{i=1}^{n}{\left(y_{i}-\beta_{0}-\sum_{j=1}^{p}{\beta_{j}x_{ij}}\right)^{2}} \ \ \ \text{subject to} \color{red}{\sum_{j=1}^{p}{|\beta_{j}|\leq s}}\]
The Ridge regression can be formalized as a constrained minimisation problem
\[\sum_{i=1}^{n}{\left(y_{i}-\beta_{0}-\sum_{j=1}^{p}{\beta_{j}x_{ij}}\right)^{2}} \ \ \ \text{subject to} \color{red}{\sum_{j=1}^{p}{\beta^{2}_{j}\leq s}}\]
variables selection and the lasso
the \(\color{cyan}{\text{light blue areas}}\) are the admissible domains for the \((\beta_{1},\beta_{2})\) solutions (lasso on the left, ridge on the right).
the ellipses are level curves of the RSS as a bivariate function.
the solution is given by the outermost ellipse that touches the admissible domain.
the lasso solution may set a coefficient exactly to 0, whereas the ridge one may be close to 0 but not exactly 0.
ridge vs lasso: non null coefficients
Synthetic data with \(n=50\) and \(p=45\) : all of the true \(\beta\) ’s differ from 0
The grid is on \(\lambda\) and on \(||\hat{\beta}_{\lambda}||/||\hat{\beta}||\); the curves are the squared bias, the \(\color{blue!20!green}{\text{variance}}\) and the \(\color{violet}{\text{MSE test}}\)
the dotted line is the true MSE test
Ridge provides a lower MSE test
ridge vs lasso: mostly null coefficients
Synthetic data with \(n=50\) and \(p=45\) : all but two of the true \(\beta\) ’s are 0
The grid is on \(\lambda\) and on \(||\hat{\beta}_{\lambda}||/||\hat{\beta}||\); the curves are the squared bias, the \(\color{blue!20!green}{\text{variance}}\) and the \(\color{violet}{\text{MSE test}}\)
the dotted line is the true MSE test
lasso provides a lower MSE test
tuning the parameter \(\lambda\) : credit data
ridge regression: cross-validation estimates given a grid of values for \(\lambda\) (left); the vertical line is the optimal value
ridge regression: ridge coefficients standardized
tuning the parameter \(\lambda\) : synthetic data
lasso regression: cross-validation for values of \(||\beta^{L}||_{1}/||\beta||_{1}\) (left); the vertical line is the optimal value
lasso regression: standardized lasso coefficients. Note that the coefficients that are null at the population level are correctly set to zero!
shrinkage methods example: hitters pre-processing
Specify the recipe: when applying ridge or lasso regression, the numerical predictors have to be scaled, and the categorical ones have to be transformed into dummies, since \(\texttt{glmnet}\) only handles numeric predictors.
Code
hit_recipe = recipe (salary~ .,data= hit_train) %>%
step_scale (all_numeric_predictors ()) %>%
step_zv (all_numeric (), - all_outcomes ()) %>%
step_dummy (all_nominal ())
hit_recipe %>% prep () %>% juice () %>% slice (1 : 8 ) %>%
kable () %>% kable_styling (font_size= 12 )
(Output omitted: the first 8 rows of the pre-processed training set, i.e. the scaled numeric predictors, the salary response and the dummy-coded categorical variables.)
shrinkage methods example: model specification
the model is still linear_reg ; the engine to use is glmnet , which requires two parameters: penalty (the \(\lambda\) value) and mixture (0 for ridge, 1 for lasso)
Specify the grid of \(\lambda\) values to be evaluated: otherwise glmnet will pick a grid internally
Code
lambda_grid <- grid_regular (penalty (),levels= 100 )
Specify the two models: mixture = 0 gives ridge regression, mixture = 1 gives the lasso; the penalty is left to be tuned
Code
ridge_spec = linear_reg (mode = "regression" , engine= "glmnet" , mixture = 0 , penalty = tune ())
lasso_spec = linear_reg (mode = "regression" , engine= "glmnet" , mixture = 1 , penalty = tune ())
shrinkage methods example: model specification
As usual, put it all together in a workflow
Code
ridge_wflow= workflow () %>%
add_recipe (hit_recipe) %>%
add_model (ridge_spec)
lasso_wflow= workflow () %>%
add_recipe (hit_recipe) %>%
add_model (lasso_spec)
shrinkage methods example: model fit
Code
ridge_results = ridge_wflow %>%
tune_grid (grid= lambda_grid,
resamples= hit_flds,
control = control_grid (verbose = FALSE , save_pred = TRUE ),
metrics = metric_set (rmse))
best_ridge = ridge_results %>% select_best (metric = "rmse" )
lasso_results = lasso_wflow %>%
tune_grid (grid= lambda_grid,
resamples= hit_flds,
control = control_grid (verbose = FALSE , save_pred = TRUE ),
metrics = metric_set (rmse))
best_lasso = lasso_results %>% select_best (metric = "rmse" )
shrinkage methods example: finalization
Code
final_ridge <- finalize_workflow (x = ridge_wflow, parameters = best_ridge) %>%
last_fit (main_split)
final_lasso <- finalize_workflow (x = lasso_wflow, parameters = best_lasso) %>%
last_fit (main_split)
ridge_rmse = final_ridge %>% collect_metrics ()|> filter (.metric== "rmse" )
lasso_rmse = final_lasso %>% collect_metrics ()|> filter (.metric== "rmse" )
reducing dimensionality
reduce model complexity by:
reducing the number of predictors (subset selection )
penalizing the model coefficients (shrinkage )
replacing the original predictors with a reduced set of linear combinations (dimension reduction )
principal components regression
\({\hat y}_{i}=\sum_{j=1}^{p}{\hat\beta}_{j}x_{ij}\) is a linear combination
\(pc_{id}=\sum_{j=1}^{p}\phi_{jd}x_{ij}\) , \(d=1,\ldots,D\) , is also a linear combination
One can regress \(y\) on \(D\) orthogonal \(pc\) 's ( \(D\ll p\) )
\[\hat{y}_{i}=\sum_{d=1}^{D}{\theta}_{d}pc_{id}\]
by plugging the formula for \(pc_{id}\) , the previous becomes
\[\hat{y}_{i}=\sum_{d=1}^{D}\theta_{d}\underbrace{\sum_{j=1}^{p}\phi_{jd}x_{ij}}_{pc_{id}}=\sum_{j=1}^{p}\sum_{d=1}^{D}\theta_{d}\phi_{jd}x_{ij}
\]
set \({\hat\beta}_{j} = \sum_{d=1}^{D}\theta_{d}\phi_{jd}\) , the previous goes back to \(\hat{y}_{i}=\sum_{j=1}^{p}{\hat\beta}_{j}x_{ij}\)
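The equivalence can be checked numerically: regress \(y\) on the first \(D\) principal component scores and map the \(\hat{\theta}_{d}\) 's back to the original predictors; a sketch on simulated data:
Code
set.seed(1)
n = 100; p = 6; D = 2
X = scale(matrix(rnorm(n * p), n, p))
y = as.vector(X %*% rnorm(p) + rnorm(n))
pca = prcomp(X)                           # principal components of X
pc_scores = pca$x[, 1:D]                  # pc_id = sum_j phi_jd * x_ij
theta = coef(lm(y ~ pc_scores))[-1]       # regress y on the D components
beta_hat = pca$rotation[, 1:D] %*% theta  # beta_j = sum_d theta_d * phi_jd
beta_hat                                  # implied coefficients on the original X_j's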
basic implementation
Code
library ("pls" )
set.seed (123 )
main_split= initial_split (my_hitters, prop= 4 / 5 )
pcr_fit = pcr (salary ~ ., data= my_hitters, scale= TRUE , validation= "CV" , subset= main_split$ in_id)
pcr_pred = predict (pcr_fit,type = "response" ,newdata = my_hitters |> slice (- main_split$ in_id), ncomp = 5 )
pcr_results = tibble (estimate = as.vector (pcr_pred),truth = my_hitters |> slice (- main_split$ in_id) |> pull (salary))
rmse (pcr_results,truth = truth, estimate = estimate)|> kbl (align= "l" )
tidymodels implementation
split the data up
Code
set.seed (123 )
hit_train= training (main_split)
hit_test= testing (main_split)
hit_flds = vfold_cv (data = hit_train,v= 5 )
define the recipe: standardize the predictors first, and then compute the principal components
Code
pcr_recipe = recipe (x= hit_train,formula = salary~ .) |>
step_normalize (all_numeric_predictors ()) |>
step_pca (all_numeric_predictors (),num_comp = tune ())
model spec and workflow (nothing changes; the number of components is the hyperparameter)
Code
pcr_mod_spec= linear_reg (mode= "regression" ,engine = "lm" )
pcr_wflow = workflow () |> add_recipe (pcr_recipe) |> add_model (pcr_mod_spec)
tidymodels implementation
set the hyperparameter grid and fit the model
Code
hparm_grid = tibble (num_comp= 1 : 10 )
pcr_fit = pcr_wflow |> tune_grid (resamples = hit_flds, grid = hparm_grid)
Code
# pcr_fit |> collect_metrics() |> filter(.metric=="rmse")
pcr_fit |> autoplot (metric = "rmse" )
select the tuned pcr fit
Code
best_pcr = pcr_fit |> select_best (metric = "rmse" )
final_pcr = finalize_workflow (x = pcr_wflow,parameters = best_pcr) |> last_fit (main_split)
pcr_rmse = final_pcr |> collect_metrics () |> filter (.metric== "rmse" )
Code
pcr_rmse |> kbl (align= "l" )
| .metric | .estimator | .estimate | .config |
|---|---|---|---|
| rmse | standard | 381.8252 | Preprocessor1_Model1 |
partial least squares regression
the PC’s are linear combinations of the \(X_{j}\) ’s built to approximate the data correlation structure, irrespective of the response \(Y\)
in PLS regression, instead, linear combinations are defined in a supervised way (that is, taking into account \(Y\) )
each of the weights \(\phi_{j1}\) of the first linear combination is the slope of the regression of \(Y\) on \(X_{j}\)
then \(pc_{1}=\sum_{j=1}^{p}{\phi_{j1}X_{j}}\) as usual
the next component (linear combination) \(pc_{2}\) is computed in the same way as \(pc_{1}\) : unlike the \(pc_{1}\) case, however, the \(X_{j}\) ’s are first orthogonalised with respect to \(pc_{1}\)
\(X_{j}^{orth}\) is the residual of the regression of \(X_{j}\) on \(pc_{1}\), that is, \(X_{j}^{orth}=X_{j}-\hat{\beta}_{j}\,pc_{1}\)
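The first component of this recipe can be built by hand; a sketch on simulated, standardized data (object names follow the slide's notation):
Code
set.seed(1)
n = 100; p = 4
X = scale(matrix(rnorm(n * p), n, p))
y = as.vector(scale(X %*% rnorm(p) + rnorm(n)))
# phi_j1 = slope of the simple regression of y on x_j
phi1 = apply(X, 2, function(xj) coef(lm(y ~ xj))[2])
pc1 = X %*% phi1  # first PLS component
# orthogonalize each predictor w.r.t. pc1 before building the next component
X_orth = apply(X, 2, function(xj) residuals(lm(xj ~ pc1)))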
basic implementation
Code
library ("pls" )
set.seed (123 )
main_split= initial_split (my_hitters, prop= 4 / 5 )
plsr_fit = plsr (salary ~ ., data= my_hitters, scale= TRUE , validation= "CV" , subset= main_split$ in_id)
plsr_pred = predict (plsr_fit,type = "response" ,newdata = my_hitters |> slice (- main_split$ in_id), ncomp = 5 )
plsr_results = tibble (estimate = as.vector (plsr_pred),truth = my_hitters |> slice (- main_split$ in_id) |> pull (salary))
Code
rmse (plsr_results,truth = truth, estimate = estimate)|> kbl (align= "l" )
tidymodels implementation
split the data up
Code
set.seed (123 )
hit_train= training (main_split)
hit_test= testing (main_split)
hit_flds = vfold_cv (data = hit_train,v= 5 )
define the recipe: compute the supervised PLS components directly from the predictors with step_pls
Code
# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("mixOmics")
plsr_recipe = recipe (x= hit_train,formula = salary~ .) |>
step_pls (all_numeric_predictors (),outcome= "salary" ,num_comp = tune ())
model spec and workflow (nothing changes; the number of components is the hyperparameter)
Code
plsr_mod_spec= linear_reg (mode= "regression" ,engine = "lm" )
plsr_wflow = workflow () |> add_recipe (plsr_recipe) |> add_model (plsr_mod_spec)
tidymodels implementation
set the hyperparameter grid and fit the model
Code
hparm_grid = tibble (num_comp= 1 : 10 )
plsr_fit = plsr_wflow |> tune_grid (resamples = hit_flds, grid = hparm_grid)
Code
plsr_fit |> autoplot (metric = "rmse" )
select the tuned plsr fit
Code
best_plsr = plsr_fit |> select_best (metric = "rmse" )
final_plsr = finalize_workflow (x = plsr_wflow,parameters = best_plsr) |>
last_fit (main_split)
plsr_rmse = final_plsr |> collect_metrics () |>
filter (.metric== "rmse" )
Code
plsr_rmse |> kbl (align= "l" )
| .metric | .estimator | .estimate | .config |
|---|---|---|---|
| rmse | standard | 376.6424 | Preprocessor1_Model1 |
wrap up
Code
ridge_rmse_value = ridge_rmse$ .estimate
lasso_rmse_value = lasso_rmse$ .estimate
rmse_all = tibble (
method= c ("best subset" ,"lasso" ,"ridge" ,"PCr" ,"PLSr" ),
rmse_values = c (bsb_rmse,lasso_rmse_value,ridge_rmse_value,pcr_rmse$ .estimate,plsr_rmse$ .estimate)
)
rmse_all |>
arrange (rmse_values) |> mutate (method= fct_inorder (method)) |>
ggplot (aes (x= method,y= rmse_values,fill= method))+ geom_col (alpha= .5 )+ theme_bw ()+ theme (legend.position = "none" )